Load packages
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("skimr")
## Warning: package 'skimr' was built under R version 3.5.2
library("here")
## here() starts at /Users/sjaganna/Desktop/CU-onedrive/OneDrive - The University of Colorado Denver/08-teaching/molb7910
This should install readr, tidyr, dplyr, purr, stringr, ggplot2, tibble, and forcats.
Set working directory
setwd(here())
In base R, we use read.table, read.csv, read.delim, etc. to import data. The problem with these functions is that they often coerce characters and other data types into factors. The tidyverse package readr guesses the data type of each column and converts types when appropriate (but does NOT convert strings to factors automatically).
We will be using readr exclusively to import data in this class. The typical commands used to input data using readr are below:
Using some of the following arguments makes it easier to control how we want to input the data.
Note: All of these functions can also be used in an interactive manner via Environment > Import Dataset > From Text (readr)
count_summary_rna <- read_delim(here("1-class1",
"data",
"count_summary_rna.tsv"),
delim = "\t",
escape_double = FALSE,
trim_ws = TRUE,
skip = 1)
## Parsed with column specification:
## cols(
## Geneid = col_character(),
## Chr = col_character(),
## Start = col_character(),
## End = col_character(),
## Strand = col_character(),
## Length = col_double(),
## `0h_1_RNA_S13_umi.bam` = col_double(),
## `0h_2_RNA_S14_umi.bam` = col_double(),
## `0h_3_RNA_S15_umi.bam` = col_double(),
## `14h_1_RNA_S22_umi.bam` = col_double(),
## `14h_2_RNA_S23_umi.bam` = col_double(),
## `14h_3_RNA_S24_umi.bam` = col_double(),
## `4h_1_RNA_S16_umi.bam` = col_double(),
## `4h_2_RNA_S17_umi.bam` = col_double(),
## `4h_3_RNA_S18_umi.bam` = col_double(),
## `8h_1_RNA_S19_umi.bam` = col_double(),
## `8h_2_RNA_S20_umi.bam` = col_double(),
## `8h_3_RNA_S21_umi.bam` = col_double()
## )
experiment_metadata <- read_delim(here("1-class1",
"data",
"experiment-metadata.txt"),
"\t",
escape_double = FALSE,
trim_ws = TRUE)
## Parsed with column specification:
## cols(
## `sample-name` = col_character(),
## hour = col_double(),
## treatment = col_character(),
## experiment = col_character()
## )
ensembl_to_genesymbol <- read_csv(here("1-class1",
"data",
"ensembl_to_genesymbol.csv"))
## Parsed with column specification:
## cols(
## `Ensembl Gene ID` = col_character(),
## `Ensembl Transcript ID` = col_character(),
## `Transcript length (including UTRs and CDS)` = col_double(),
## `CDS Length` = col_double(),
## `Associated Gene Name` = col_character()
## )
Some of the useful Data Frame Functions are as follows:
- head() - shows first 6 rows
- tail() - shows last 6 rows
- dim() - returns the dimensions of data frame (i.e. number of rows and number of columns)
- nrow() - number of rows
- ncol() - number of columns
- names() or colnames() - both show the names attribute for a data frame
- sapply(dataframe, class) - shows the class of each column in the data frame
- str() - structure of data frame - name, type and preview of data in each column - glimpse()
count_summary_rna #this works well with tbl_df data type. Others could take a LONG time
## # A tibble: 20,025 x 18
## Geneid Chr Start End Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENST0… chr1… 9446… 9448… -;-;-… 2265 243 322
## 2 ENST0… chr1… 9614… 9615… +;+;+… 1686 19 17
## 3 ENST0… chr1 1014… 1014… + 473 45 53
## 4 ENST0… chr1… 1041… 1041… +;+;+… 5060 42 50
## 5 ENST0… chr1… 1082… 1082… -;-;-… 1148 17 19
## 6 ENST0… chr1 1217… 1217… - 168 27.5 33.7
## 7 ENST0… chr1… 1217… 1217… -;-;-… 1070 158 170.
## 8 ENST0… chr1… 1217… 1217… -;-;-… 1028 148. 162.
## 9 ENST0… chr1 1232… 1233… + 866 11 21
## 10 ENST0… chr1… 1255… 1255… -;-;-… 625 27.3 23.8
## # ... with 20,015 more rows, and 10 more variables:
## # `0h_3_RNA_S15_umi.bam` <dbl>, `14h_1_RNA_S22_umi.bam` <dbl>,
## # `14h_2_RNA_S23_umi.bam` <dbl>, `14h_3_RNA_S24_umi.bam` <dbl>,
## # `4h_1_RNA_S16_umi.bam` <dbl>, `4h_2_RNA_S17_umi.bam` <dbl>,
## # `4h_3_RNA_S18_umi.bam` <dbl>, `8h_1_RNA_S19_umi.bam` <dbl>,
## # `8h_2_RNA_S20_umi.bam` <dbl>, `8h_3_RNA_S21_umi.bam` <dbl>
# display first six rows of a dataframe
head(count_summary_rna)
## # A tibble: 6 x 18
## Geneid Chr Start End Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENST0… chr1… 9446… 9448… -;-;-… 2265 243 322
## 2 ENST0… chr1… 9614… 9615… +;+;+… 1686 19 17
## 3 ENST0… chr1 1014… 1014… + 473 45 53
## 4 ENST0… chr1… 1041… 1041… +;+;+… 5060 42 50
## 5 ENST0… chr1… 1082… 1082… -;-;-… 1148 17 19
## 6 ENST0… chr1 1217… 1217… - 168 27.5 33.7
## # ... with 10 more variables: `0h_3_RNA_S15_umi.bam` <dbl>,
## # `14h_1_RNA_S22_umi.bam` <dbl>, `14h_2_RNA_S23_umi.bam` <dbl>,
## # `14h_3_RNA_S24_umi.bam` <dbl>, `4h_1_RNA_S16_umi.bam` <dbl>,
## # `4h_2_RNA_S17_umi.bam` <dbl>, `4h_3_RNA_S18_umi.bam` <dbl>,
## # `8h_1_RNA_S19_umi.bam` <dbl>, `8h_2_RNA_S20_umi.bam` <dbl>,
## # `8h_3_RNA_S21_umi.bam` <dbl>
# display last six rows of a dataframe
tail(count_summary_rna)
## # A tibble: 6 x 18
## Geneid Chr Start End Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENST0… chrX… 1550… 1550… +;+;+… 785 13.7 15.3
## 2 ENST0… chrX… 1550… 1550… +;+;+… 858 13.7 15.3
## 3 ENST0… chrX… 1552… 1552… +;+;+… 598 48 43.5
## 4 ENST0… chrX… 1552… 1552… +;+;+… 486 48 42.5
## 5 ENST0… chrX… 1554… 1554… -;-;-… 1269 15 13.5
## 6 ENST0… chrX… 1554… 1554… -;-;-… 1133 15 13.5
## # ... with 10 more variables: `0h_3_RNA_S15_umi.bam` <dbl>,
## # `14h_1_RNA_S22_umi.bam` <dbl>, `14h_2_RNA_S23_umi.bam` <dbl>,
## # `14h_3_RNA_S24_umi.bam` <dbl>, `4h_1_RNA_S16_umi.bam` <dbl>,
## # `4h_2_RNA_S17_umi.bam` <dbl>, `4h_3_RNA_S18_umi.bam` <dbl>,
## # `8h_1_RNA_S19_umi.bam` <dbl>, `8h_2_RNA_S20_umi.bam` <dbl>,
## # `8h_3_RNA_S21_umi.bam` <dbl>
#Compactly display the internal structure of an R object
str(count_summary_rna)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20025 obs. of 18 variables:
## $ Geneid : chr "ENST00000327044.6_51_2298" "ENST00000338591.7_360_2034" "ENST00000379389.4_176_647" "ENST00000379370.6_1158_6186" ...
## $ Chr : chr "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1" "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1" "chr1" "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c"| __truncated__ ...
## $ Start : chr "944696;945056;945517;946172;946401;948130;948489;951126;951999;952411;953174;953781;954003;955922;956094;956893"| __truncated__ "961438;961628;961825;962354;962703;963108;963336;963919;964106;964348;964962" "1014005" "1041633;1041955;1043238;1043537;1043822;1044108;1044333;1045160;1045358;1045732;1045963;1046159;1046396;1046819"| __truncated__ ...
## $ End : chr "944800;945146;945653;946286;946545;948232;948603;951238;952139;952600;953288;953892;954082;956013;956215;957025"| __truncated__ "961552;961750;962047;962471;962917;963253;963504;964008;964180;964530;965190" "1014477" "1041702;1042162;1043457;1043732;1044023;1044257;1044439;1045277;1045523;1045876;1046088;1046265;1046735;1046957"| __truncated__ ...
## $ Strand : chr "-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-" "+;+;+;+;+;+;+;+;+;+;+" "+" "+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+" ...
## $ Length : num 2265 1686 473 5060 1148 ...
## $ 0h_1_RNA_S13_umi.bam : num 243 19 45 42 17 ...
## $ 0h_2_RNA_S14_umi.bam : num 322 17 53 50 19 ...
## $ 0h_3_RNA_S15_umi.bam : num 303 15 48 52 25 ...
## $ 14h_1_RNA_S22_umi.bam: num 177 9 11 32 3 ...
## $ 14h_2_RNA_S23_umi.bam: num 177 5 5 31 0 ...
## $ 14h_3_RNA_S24_umi.bam: num 239 8 14 30 2 ...
## $ 4h_1_RNA_S16_umi.bam : num 283 10 37 43 17 ...
## $ 4h_2_RNA_S17_umi.bam : num 229 12 39 47 13 ...
## $ 4h_3_RNA_S18_umi.bam : num 251 19 46 47 21 ...
## $ 8h_1_RNA_S19_umi.bam : num 267 13 38 44 12 ...
## $ 8h_2_RNA_S20_umi.bam : num 257 18 36 44 15 ...
## $ 8h_3_RNA_S21_umi.bam : num 231 12 22 50 10 ...
## - attr(*, "spec")=
## .. cols(
## .. Geneid = col_character(),
## .. Chr = col_character(),
## .. Start = col_character(),
## .. End = col_character(),
## .. Strand = col_character(),
## .. Length = col_double(),
## .. `0h_1_RNA_S13_umi.bam` = col_double(),
## .. `0h_2_RNA_S14_umi.bam` = col_double(),
## .. `0h_3_RNA_S15_umi.bam` = col_double(),
## .. `14h_1_RNA_S22_umi.bam` = col_double(),
## .. `14h_2_RNA_S23_umi.bam` = col_double(),
## .. `14h_3_RNA_S24_umi.bam` = col_double(),
## .. `4h_1_RNA_S16_umi.bam` = col_double(),
## .. `4h_2_RNA_S17_umi.bam` = col_double(),
## .. `4h_3_RNA_S18_umi.bam` = col_double(),
## .. `8h_1_RNA_S19_umi.bam` = col_double(),
## .. `8h_2_RNA_S20_umi.bam` = col_double(),
## .. `8h_3_RNA_S21_umi.bam` = col_double()
## .. )
#Get A Glimpse Of Your Data
glimpse(count_summary_rna)
## Observations: 20,025
## Variables: 18
## $ Geneid <chr> "ENST00000327044.6_51_2298", "ENST0000...
## $ Chr <chr> "chr1;chr1;chr1;chr1;chr1;chr1;chr1;ch...
## $ Start <chr> "944696;945056;945517;946172;946401;94...
## $ End <chr> "944800;945146;945653;946286;946545;94...
## $ Strand <chr> "-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-...
## $ Length <dbl> 2265, 1686, 473, 5060, 1148, 168, 1070...
## $ `0h_1_RNA_S13_umi.bam` <dbl> 243.00, 19.00, 45.00, 42.00, 17.00, 27...
## $ `0h_2_RNA_S14_umi.bam` <dbl> 322.00, 17.00, 53.00, 50.00, 19.00, 33...
## $ `0h_3_RNA_S15_umi.bam` <dbl> 303.00, 15.00, 48.00, 52.00, 25.00, 36...
## $ `14h_1_RNA_S22_umi.bam` <dbl> 177.00, 9.00, 11.00, 32.00, 3.00, 22.5...
## $ `14h_2_RNA_S23_umi.bam` <dbl> 177.00, 5.00, 5.00, 31.00, 0.00, 29.17...
## $ `14h_3_RNA_S24_umi.bam` <dbl> 239.00, 8.00, 14.00, 30.00, 2.00, 27.3...
## $ `4h_1_RNA_S16_umi.bam` <dbl> 283.00, 10.00, 37.00, 43.00, 17.00, 31...
## $ `4h_2_RNA_S17_umi.bam` <dbl> 229.00, 12.00, 39.00, 47.00, 13.00, 40...
## $ `4h_3_RNA_S18_umi.bam` <dbl> 251.00, 19.00, 46.00, 47.00, 21.00, 23...
## $ `8h_1_RNA_S19_umi.bam` <dbl> 267.00, 13.00, 38.00, 44.00, 12.00, 33...
## $ `8h_2_RNA_S20_umi.bam` <dbl> 257.00, 18.00, 36.00, 44.00, 15.00, 36...
## $ `8h_3_RNA_S21_umi.bam` <dbl> 231.00, 12.00, 22.00, 50.00, 10.00, 29...
glimpse(experiment_metadata)
## Observations: 12
## Variables: 4
## $ `sample-name` <chr> "0h_1_RNA_S13_umi.bam", "0h_2_RNA_S14_umi.bam", ...
## $ hour <dbl> 0, 0, 0, 4, 4, 4, 8, 8, 8, 14, 14, 14
## $ treatment <chr> "doxcycyline", "doxcycyline", "doxcycyline", "do...
## $ experiment <chr> "rnaseq", "rnaseq", "rnaseq", "rnaseq", "rnaseq"...
A generic function used to produce result summaries of the results of various model fitting functions.
summary(count_summary_rna)
## Geneid Chr Start
## Length:20025 Length:20025 Length:20025
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## End Strand Length
## Length:20025 Length:20025 Min. : 9
## Class :character Class :character 1st Qu.: 660
## Mode :character Mode :character Median : 1235
## Mean : 1635
## 3rd Qu.: 2053
## Max. :108334
## 0h_1_RNA_S13_umi.bam 0h_2_RNA_S14_umi.bam 0h_3_RNA_S15_umi.bam
## Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 13.0 1st Qu.: 14.67 1st Qu.: 14.0
## Median : 38.0 Median : 42.00 Median : 39.5
## Mean : 120.8 Mean : 133.27 Mean : 125.1
## 3rd Qu.: 100.8 3rd Qu.: 111.00 3rd Qu.: 103.5
## Max. :24374.0 Max. :29852.00 Max. :27929.0
## 14h_1_RNA_S22_umi.bam 14h_2_RNA_S23_umi.bam 14h_3_RNA_S24_umi.bam
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 7.00 1st Qu.: 6.00 1st Qu.: 7.33
## Median : 27.00 Median : 23.00 Median : 27.57
## Mean : 119.78 Mean : 99.32 Mean : 116.78
## 3rd Qu.: 91.67 3rd Qu.: 75.00 3rd Qu.: 89.00
## Max. :26483.50 Max. :23692.50 Max. :24847.50
## 4h_1_RNA_S16_umi.bam 4h_2_RNA_S17_umi.bam 4h_3_RNA_S18_umi.bam
## Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 12.5 1st Qu.: 12.17 1st Qu.: 11.5
## Median : 37.5 Median : 36.32 Median : 34.0
## Mean : 121.9 Mean : 117.99 Mean : 109.9
## 3rd Qu.: 100.8 3rd Qu.: 97.67 3rd Qu.: 92.0
## Max. :28571.0 Max. :27655.00 Max. :25130.0
## 8h_1_RNA_S19_umi.bam 8h_2_RNA_S20_umi.bam 8h_3_RNA_S21_umi.bam
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 10.33 1st Qu.: 11.0 1st Qu.: 10.25
## Median : 33.33 Median : 35.5 Median : 33.50
## Mean : 120.41 Mean : 125.1 Mean : 118.88
## 3rd Qu.: 95.00 3rd Qu.: 100.5 3rd Qu.: 95.50
## Max. :27141.00 Max. :27735.0 Max. :24991.00
summary(count_summary_rna$`0h_1_RNA_S13_umi.bam`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 13.0 38.0 120.8 100.8 24374.0
An alternative to summary is skim.
library(skimr)
skim(count_summary_rna)
## Skim summary statistics
## n obs: 20025
## n variables: 18
##
## ── Variable type:character ─────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## Chr 0 20025 20025 4 1809 0 1050
## End 0 20025 20025 6 3619 0 19249
## Geneid 0 20025 20025 22 29 0 20025
## Start 0 20025 20025 6 3619 0 19283
## Strand 0 20025 20025 1 723 0 165
##
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## 0h_1_RNA_S13_umi.bam 0 20025 20025 120.81 538.88 0 13
## 0h_2_RNA_S14_umi.bam 0 20025 20025 133.27 603.79 0 14.67
## 0h_3_RNA_S15_umi.bam 0 20025 20025 125.13 569.79 0 14
## 14h_1_RNA_S22_umi.bam 0 20025 20025 119.78 478.79 0 7
## 14h_2_RNA_S23_umi.bam 0 20025 20025 99.32 418.02 0 6
## 14h_3_RNA_S24_umi.bam 0 20025 20025 116.78 475.03 0 7.33
## 4h_1_RNA_S16_umi.bam 0 20025 20025 121.95 536.99 0 12.5
## 4h_2_RNA_S17_umi.bam 0 20025 20025 117.99 526 0 12.17
## 4h_3_RNA_S18_umi.bam 0 20025 20025 109.89 482.37 0 11.5
## 8h_1_RNA_S19_umi.bam 0 20025 20025 120.41 512.3 0 10.33
## 8h_2_RNA_S20_umi.bam 0 20025 20025 125.1 521.28 0 11
## 8h_3_RNA_S21_umi.bam 0 20025 20025 118.88 496.42 0 10.25
## Length 0 20025 20025 1634.98 2204.56 9 660
## p50 p75 p100 hist
## 38 100.83 24374 ▇▁▁▁▁▁▁▁
## 42 111 29852 ▇▁▁▁▁▁▁▁
## 39.5 103.5 27929 ▇▁▁▁▁▁▁▁
## 27 91.67 26483.5 ▇▁▁▁▁▁▁▁
## 23 75 23692.5 ▇▁▁▁▁▁▁▁
## 27.57 89 24847.5 ▇▁▁▁▁▁▁▁
## 37.5 100.83 28571 ▇▁▁▁▁▁▁▁
## 36.32 97.67 27655 ▇▁▁▁▁▁▁▁
## 34 92 25130 ▇▁▁▁▁▁▁▁
## 33.33 95 27141 ▇▁▁▁▁▁▁▁
## 35.5 100.5 27735 ▇▁▁▁▁▁▁▁
## 33.5 95.5 24991 ▇▁▁▁▁▁▁▁
## 1235 2053 108334 ▇▁▁▁▁▁▁▁
Uses the cross-classifying factors to build a contingency table of the counts at each combination of factor levels.
table(experiment_metadata$hour)
##
## 0 4 8 14
## 3 3 3 3
The two simplest plots to use to get a sense of your data are plot(x) and hist(x)
plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`)
plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`, count_summary_rna$`0h_2_RNA_S14_umi.bam`)
plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`, count_summary_rna$`14h_1_RNA_S22_umi.bam`)
plot(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`), log2(count_summary_rna$`14h_1_RNA_S22_umi.bam`))
plot(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`), log2(count_summary_rna$`0h_2_RNA_S14_umi.bam`))
hist(count_summary_rna$`0h_1_RNA_S13_umi.bam`)
hist(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`))
What is tidy data?
“Tidy datasets are all alike but every messy dataset is messy in its own way” Hadley Wickham
The four verbs to keep in mind for reshaping data with tidyr are:
- spread
- gather
- separate
- unite
The verbs to keep in mind for transforming data with dplyr are:
- select
- rename - arrange
- filter
- mutate
- group_by - summarise - The pipe: %>%
data <- count_summary_rna %>%
select(-Chr, -Start, -End, -Strand)
names(data)
## [1] "Geneid" "Length"
## [3] "0h_1_RNA_S13_umi.bam" "0h_2_RNA_S14_umi.bam"
## [5] "0h_3_RNA_S15_umi.bam" "14h_1_RNA_S22_umi.bam"
## [7] "14h_2_RNA_S23_umi.bam" "14h_3_RNA_S24_umi.bam"
## [9] "4h_1_RNA_S16_umi.bam" "4h_2_RNA_S17_umi.bam"
## [11] "4h_3_RNA_S18_umi.bam" "8h_1_RNA_S19_umi.bam"
## [13] "8h_2_RNA_S20_umi.bam" "8h_3_RNA_S21_umi.bam"
data <- data %>% rename(
hour00_rep1 = `0h_1_RNA_S13_umi.bam`,
hour00_rep2 = `0h_2_RNA_S14_umi.bam`,
hour00_rep3 = `0h_3_RNA_S15_umi.bam`,
hour04_rep1 = `4h_1_RNA_S16_umi.bam`,
hour04_rep2 = `4h_2_RNA_S17_umi.bam`,
hour04_rep3 = `4h_3_RNA_S18_umi.bam`,
hour08_rep1 = `8h_1_RNA_S19_umi.bam`,
hour08_rep2 = `8h_2_RNA_S20_umi.bam`,
hour08_rep3 = `8h_3_RNA_S21_umi.bam`,
hour14_rep1 = `14h_1_RNA_S22_umi.bam`,
hour14_rep2 = `14h_2_RNA_S23_umi.bam`,
hour14_rep3 = `14h_3_RNA_S24_umi.bam`)
## Using dplyr
# data <- data %>% select(Geneid, Length, hour00_rep1, hour00_rep2, hour00_rep3, hour04_rep1, hour04_rep2, hour04_rep3, hour08_rep1, hour08_rep2, hour08_rep3, hour14_rep1, hour14_rep2, hour14_rep3)
## Alternative with baseR
data <- data[, c(1:5, 9:14, 6:8)]
data <- data %>% arrange(hour14_rep1)
data <- data %>% arrange(desc(hour14_rep1))
data_sub <- data %>% filter(hour14_rep1 > 2000)
Example: Log transformation or sum of multiple columns
data_sum <- data %>% mutate(sum_hour00 = sum(hour00_rep1, hour00_rep2, hour00_rep3))
join to map ensembl IDs to gene_symbols In order to merge the dataframe, we need to first get the Ensembl id in our dataframe to match the Ensembl id in the mapping file. Let’s take a look at these.
head(data$Geneid)
## [1] "ENST00000550420.2_116_632" "ENST00000544301.6_414_1812"
## [3] "ENST00000550420.2_722_1007" "ENST00000370651.5_1154_1673"
## [5] "ENST00000357726.4_28_1477" "ENST00000318203.9_698_1997"
head(ensembl_to_genesymbol$`Ensembl Transcript ID`)
## [1] "ENST00000361390" "ENST00000361453" "ENST00000361624" "ENST00000361739"
## [5] "ENST00000361851" "ENST00000361899"
The Ensembl transcript id in our data table has trailing characters that need to be removed. This is an example of “string manipulation”. stringr is a very powerful package for string manipulation in R that would take an entire class (at least) to go through. Instead, we are going to use tidyr’s separate to accomplish this task.
data <- data %>% separate(Geneid, sep = "\\.", into = c("Geneid", "extra"))
head(data$Geneid)
## [1] "ENST00000550420" "ENST00000544301" "ENST00000550420" "ENST00000370651"
## [5] "ENST00000357726" "ENST00000318203"
It looks like we have the Geneid in the form we need for merging. Now would be a good time to also get the column names to match to enable easy merging.
names(ensembl_to_genesymbol)
## [1] "Ensembl Gene ID"
## [2] "Ensembl Transcript ID"
## [3] "Transcript length (including UTRs and CDS)"
## [4] "CDS Length"
## [5] "Associated Gene Name"
ensembl_to_genesymbol <- ensembl_to_genesymbol %>% rename(
ensembl_gene = `Ensembl Gene ID`,
ensembl_transcript = `Ensembl Transcript ID`,
transcript_length = `Transcript length (including UTRs and CDS)`,
cds_length = `CDS Length`,
gene_symbol = `Associated Gene Name`)
# Remove unnecessary columns
ensembl_to_genesymbol <- ensembl_to_genesymbol %>% select(ensembl_transcript, gene_symbol)
names(data)
## [1] "Geneid" "extra" "Length" "hour00_rep1" "hour00_rep2"
## [6] "hour00_rep3" "hour04_rep1" "hour04_rep2" "hour04_rep3" "hour08_rep1"
## [11] "hour08_rep2" "hour08_rep3" "hour14_rep1" "hour14_rep2" "hour14_rep3"
data <- data %>% rename(
ensembl_transcript = Geneid)
data_genelevel <- left_join(data, ensembl_to_genesymbol)
## Joining, by = "ensembl_transcript"
# remove unnecessary columns
data_genelevel <- data_genelevel %>% select (-extra, -Length, -ensembl_transcript)
# rearrange to get gene_symbol first
data_genelevel <- data_genelevel %>% select (gene_symbol, everything())
names(data_genelevel)
## [1] "gene_symbol" "hour00_rep1" "hour00_rep2" "hour00_rep3" "hour04_rep1"
## [6] "hour04_rep2" "hour04_rep3" "hour08_rep1" "hour08_rep2" "hour08_rep3"
## [11] "hour14_rep1" "hour14_rep2" "hour14_rep3"
# check number of transcripts vs. number of genes
length(data_genelevel$gene_symbol)
## [1] 20025
length(unique(data_genelevel$gene_symbol))
## [1] 10568
# group_by genes and summarise counts
nrow(data_genelevel)
## [1] 20025
data_genelevel <- data_genelevel %>%
group_by(gene_symbol) %>%
summarise(hour00_rep1 = sum(hour00_rep1),
hour00_rep2 = sum(hour00_rep2),
hour00_rep3 = sum(hour00_rep3),
hour04_rep1 = sum(hour04_rep1),
hour04_rep2 = sum(hour04_rep2),
hour04_rep3 = sum(hour04_rep3),
hour08_rep1 = sum(hour08_rep1),
hour08_rep2 = sum(hour08_rep2),
hour08_rep3 = sum(hour08_rep3),
hour14_rep1 = sum(hour14_rep1),
hour14_rep2 = sum(hour14_rep2),
hour14_rep3 = sum(hour14_rep3))
nrow(data_genelevel)
## [1] 10568
TBD